library(limma)
library(edgeR)
library(biomaRt)
library(ggplot2)
library(ggrepel)
library(tidyr)
library(RUVSeq)
library(clusterProfiler)
library(here)
## here() starts at /projects/imb-pkbphil/sp/rnaseq/six_donor_trans/splicing_paper
i_am("R/six_donor_DGE_limma.Rmd")
## here() starts at /projects/imb-pkbphil/sp/rnaseq/six_donor_trans/splicing_paper
knitr::opts_chunk$set(echo = TRUE, dev = c("pdf"), fig.path = "six_donor_DGE_limma_plots/")
Load experiment 1; select relevant samples
la = read.delim(here("02featureCounts/late_adipo_s4_s6.nomm.feature.counts"), skip=1)
colnames(la) = gsub("output.01hisat.","",gsub(".sorted.bam","", colnames(la)))
# select the bulk day15 samples
la = la[,c(1:6,grep("^(19|2[01234])", colnames(la)))]
head(la)
## Geneid Chr
## 1 ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
## 2 ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
## 3 ENSG00000278267.1 chr1
## 4 ENSG00000243485.5 chr1;chr1;chr1;chr1;chr1
## 5 ENSG00000284332.1 chr1
## 6 ENSG00000237613.2 chr1;chr1;chr1;chr1;chr1
## Start
## 1 11869;12010;12179;12613;12613;12975;13221;13221;13453
## 2 14404;15005;15796;16607;16858;17233;17606;17915;18268;24738;29534
## 3 17369
## 4 29554;30267;30564;30976;30976
## 5 30366
## 6 34554;35245;35277;35721;35721
## End
## 1 12227;12057;12227;12721;12697;13052;13374;14409;13670
## 2 14501;15038;15947;16765;17055;17368;17742;18061;18366;24891;29570
## 3 17436
## 4 30039;30667;30667;31109;31097
## 5 30503
## 6 35174;35481;35481;36073;36081
## Strand Length 19.21423_S40 20.21424_S47 21.21425_S54
## 1 +;+;+;+;+;+;+;+;+ 1735 0 0 0
## 2 -;-;-;-;-;-;-;-;-;-;- 1351 5 15 27
## 3 - 68 6 6 5
## 4 +;+;+;+;+ 1021 0 0 0
## 5 + 138 0 0 0
## 6 -;-;-;-;- 1219 1 2 1
## 22.21426_S3 23.21427_S10 24.21428_S17
## 1 0 0 0
## 2 13 13 37
## 3 8 3 9
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
Load experiment 2; select relevant samples
fc = read.delim(here("02featureCounts/beige_rnaseq.nomm.feature.counts"), skip=1)
colnames(fc) = gsub("output.01hisat.","",gsub(".sorted.bam","", colnames(fc)))
head(fc)[1:9]
## Geneid Chr
## 1 ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
## 2 ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
## 3 ENSG00000278267.1 chr1
## 4 ENSG00000243485.5 chr1;chr1;chr1;chr1;chr1
## 5 ENSG00000284332.1 chr1
## 6 ENSG00000237613.2 chr1;chr1;chr1;chr1;chr1
## Start
## 1 11869;12010;12179;12613;12613;12975;13221;13221;13453
## 2 14404;15005;15796;16607;16858;17233;17606;17915;18268;24738;29534
## 3 17369
## 4 29554;30267;30564;30976;30976
## 5 30366
## 6 34554;35245;35277;35721;35721
## End
## 1 12227;12057;12227;12721;12697;13052;13374;14409;13670
## 2 14501;15038;15947;16765;17055;17368;17742;18061;18366;24891;29570
## 3 17436
## 4 30039;30667;30667;31109;31097
## 5 30503
## 6 35174;35481;35481;36073;36081
## Strand Length 1.22589_S146 2.22590_S149 3.22591_S154
## 1 +;+;+;+;+;+;+;+;+ 1735 0 0 0
## 2 -;-;-;-;-;-;-;-;-;-;- 1351 323 193 171
## 3 - 68 45 39 26
## 4 +;+;+;+;+ 1021 0 0 0
## 5 + 138 0 0 0
## 6 -;-;-;-;- 1219 0 0 0
Match experiments
nrow(la); nrow(fc)
## [1] 60668
## [1] 60668
counts = merge(fc, la, by=colnames(fc)[1:6])
#head(counts)
colnames(counts)
## [1] "Geneid" "Chr" "Start" "End"
## [5] "Strand" "Length" "1.22589_S146" "2.22590_S149"
## [9] "3.22591_S154" "4.22592_S128" "5.22593_S132" "6.22594_S136"
## [13] "7.22595_S140" "8.22596_S143" "9.22597_S147" "10.22598_S150"
## [17] "11.22599_S155" "12.22600_S129" "13.22601_S133" "14.22602_S137"
## [21] "15.22603_S141" "16.22604_S144" "17.22605_S148" "18.22606_S151"
## [25] "19.22607_S156" "20.22608_S130" "21.22609_S134" "22.22610_S138"
## [29] "23.22611_S142" "24.22612_S145" "25.22613_S153" "26.22614_S152"
## [33] "27.22615_S157" "28.22616_S131" "29.22617_S135" "30.22618_S139"
## [37] "19.21423_S40" "20.21424_S47" "21.21425_S54" "22.21426_S3"
## [41] "23.21427_S10" "24.21428_S17"
nrow(counts)
## [1] 60668
Save this to file for GEO submission later. [Reanalysis of s4/s6 white]
write.table(counts, here( "02featureCounts/six_donors.merged.nomm.counts"), sep="\t", quote=F, row.names = F)
formatting
counts$Geneid = gsub("\\..*", "", counts$Geneid)
counts[counts$Geneid == "ENSG00000132170",7:ncol(counts)]# double check gene matching
## 1.22589_S146 2.22590_S149 3.22591_S154 4.22592_S128 5.22593_S132
## 6545 11069 13502 17008 7451 14564
## 6.22594_S136 7.22595_S140 8.22596_S143 9.22597_S147 10.22598_S150
## 6545 8291 11652 14165 17663 20856
## 11.22599_S155 12.22600_S129 13.22601_S133 14.22602_S137 15.22603_S141
## 6545 12708 10109 16121 17043 12563
## 16.22604_S144 17.22605_S148 18.22606_S151 19.22607_S156 20.22608_S130
## 6545 7407 8276 16290 10702 12934
## 21.22609_S134 22.22610_S138 23.22611_S142 24.22612_S145 25.22613_S153
## 6545 13346 10671 9431 12668 12939
## 26.22614_S152 27.22615_S157 28.22616_S131 29.22617_S135 30.22618_S139
## 6545 7333 5670 8552 5035 9837
## 19.21423_S40 20.21424_S47 21.21425_S54 22.21426_S3 23.21427_S10
## 6545 10822 12847 13837 8180 16506
## 24.21428_S17
## 6545 18386
Load info
info = readxl::read_xlsx(here("sample_info/publication_ids.xlsx"))
info$frac_assigned_to_genes = as.double(gsub("%","", info$percent_assigned_to_genes))/100
info$donor.condition = paste(info$donor, info$condition, sep=".")
head(info)
## # A tibble: 6 × 11
## sample time rep donor condition name dataset fc_file
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 19-21423_S40 day15 1 subject4 white day15_subject4_wh… late_a… beige_…
## 2 20-21424_S47 day15 2 subject4 white day15_subject4_wh… late_a… beige_…
## 3 21-21425_S54 day15 3 subject4 white day15_subject4_wh… late_a… beige_…
## 4 22-21426_S3 day15 1 subject6 white day15_subject6_wh… late_a… beige_…
## 5 23-21427_S10 day15 2 subject6 white day15_subject6_wh… late_a… beige_…
## 6 24-21428_S17 day15 3 subject6 white day15_subject6_wh… late_a… beige_…
## # ℹ 3 more variables: percent_assigned_to_genes <chr>,
## # frac_assigned_to_genes <dbl>, donor.condition <chr>
counts$Geneid = gsub("\\..*", "", counts$Geneid)
counts[counts$Geneid == "ENSG00000132170",7:ncol(counts)]# double check gene matching
## 1.22589_S146 2.22590_S149 3.22591_S154 4.22592_S128 5.22593_S132
## 6545 11069 13502 17008 7451 14564
## 6.22594_S136 7.22595_S140 8.22596_S143 9.22597_S147 10.22598_S150
## 6545 8291 11652 14165 17663 20856
## 11.22599_S155 12.22600_S129 13.22601_S133 14.22602_S137 15.22603_S141
## 6545 12708 10109 16121 17043 12563
## 16.22604_S144 17.22605_S148 18.22606_S151 19.22607_S156 20.22608_S130
## 6545 7407 8276 16290 10702 12934
## 21.22609_S134 22.22610_S138 23.22611_S142 24.22612_S145 25.22613_S153
## 6545 13346 10671 9431 12668 12939
## 26.22614_S152 27.22615_S157 28.22616_S131 29.22617_S135 30.22618_S139
## 6545 7333 5670 8552 5035 9837
## 19.21423_S40 20.21424_S47 21.21425_S54 22.21426_S3 23.21427_S10
## 6545 10822 12847 13837 8180 16506
## 24.21428_S17
## 6545 18386
rownames(info) = gsub("-",".", info$sample)
## Warning: Setting row names on a tibble is deprecated.
info = info[colnames(counts)[7:ncol(counts)],]#make sure order is the same
summary(rownames(info) == colnames(counts)[7:ncol(counts)])
## Mode FALSE
## logical 36
ob = DGEList(counts = data.matrix(counts[7:ncol(counts)]),
samples = info,
group = info$donor.condition,
genes = counts[c(1,6)])
rownames(ob$counts) = ob$genes$Geneid
#head(ob)
summary(ob$counts[grep("ENSG00000132170", rownames(ob$counts)),c(1,5:ncol(ob$counts))]) ##PPARG check
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5035 9431 12563 12075 14165 20856
plotMDS(ob, top = 5000, labels =ob$samples$group, col = c(1:12)[ob$samples$group])
library(RColorBrewer)
to_col = colorRampPalette(c("blue","red"))(25)
plotMDS(ob, top = 5000, labels =ob$samples$percent_assigned_to_genes, col = c(1:12)[ob$samples$group])
plotMDS(ob, top = 5000, labels =ob$samples$group, col = c(1:12)[ob$samples$group],
dim.plot = c(3,4))
plotMDS(ob, top = 5000, labels =ob$samples$group, col = c(1:12)[ob$samples$group],
dim.plot = c(5,6))
plotMDS(ob, top = 5000, labels =ob$samples$percent_assigned_to_genes, col = c(1:12)[ob$samples$group],
dim.plot = c(5,6))
plotRLE(ob$counts, outline=FALSE, col=ob$samples$group, main="Before Filtering")
plotSmear(ob, main = "Before Filtering")
plotSmear(ob, main = "Before Filtering", pair=c("subject6.beige","subject6.white"))
ob$samples$group = as.factor(ob$samples$condition)
plotSmear(ob, main = "Before Filtering: White vs Beige")